In [19]:
# https://www.kaggle.com/mlg-ulb/creditcardfraud
# For example, here's several helpful packages to load in
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from imblearn.over_sampling import SMOTE
#plot shit
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.metrics import average_precision_score
from xgboost.sklearn import XGBClassifier
from xgboost import plot_importance, to_graphviz
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
from subprocess import check_output
print(check_output(["ls", "../input"]).decode("utf8"))
# Any results you write to the current directory are saved as output.
In [2]:
df1 = pd.read_csv("../input/creditcard.csv")
In [3]:
df1.head()
Out[3]:
In [4]:
print(len(df1))
print(df1['Class'].sum())
In [5]:
limit = len(df1)
def plotStrip(x, y, hue, figsize = (14, 9)):
fig = plt.figure(figsize = figsize)
colours = plt.cm.tab10(np.linspace(0, 1, 9))
with sns.axes_style('ticks'):
ax = sns.stripplot(x, y, \
hue = hue, jitter = 0.4, marker = '.', \
size = 4, palette = colours)
ax.set_xlabel('')
ax.set_xticklabels(['genuine', 'fraudulent'], size = 16)
for axis in ['top','bottom','left','right']:
ax.spines[axis].set_linewidth(2)
handles, labels = ax.get_legend_handles_labels()
plt.legend(handles, ['Transfer', 'Cash out'], bbox_to_anchor=(1, 1), \
loc=2, borderaxespad=0, fontsize = 16);
return ax
In [6]:
X = df1
Y = X['Class']
del X['Class']
In [7]:
print('skew = {}'.format( sum((Y)) / float(len(X)) ))
In [8]:
randomState = 5
np.random.seed(randomState)
trainX, testX, trainY, testY = train_test_split(X, Y, test_size = 0.2, random_state = randomState)
In [63]:
# I got to this point with mostly manual tuning of XGB
weights = (Y == 0).sum() / (1.0 * (Y == 1).sum())
clf = XGBClassifier(eta = 0.3, max_depth = 7, nthread = 6, scale_pos_weight = weights)
probabilities = clf.fit(trainX, trainY).predict_proba(testX)
print('AUPRC = {}'.format(average_precision_score(testY, \
probabilities[:, 1])))
In [62]:
fig = plt.figure(figsize = (14, 9))
ax = fig.add_subplot(111)
colours = plt.cm.Set1(np.linspace(0, 1, 9))
ax = plot_importance(clf, height = 1, color = colours, grid = False, importance_type = 'cover', ax = ax);
for axis in ['top','bottom','left','right']:
ax.spines[axis].set_linewidth(2)
ax.set_xlabel('importance score', size = 16);
ax.set_ylabel('features', size = 16);
ax.set_yticklabels(ax.get_yticklabels(), size = 12);
ax.set_title('Ordering of features by importance to the model learnt', size = 20);
In [34]:
# Long computation in this cell (~6 minutes)
trainSizes, trainScores, crossValScores = learning_curve(\
XGBClassifier(max_depth = 7, scale_pos_weight = weights, nthread = 4), trainX,\
trainY, scoring = 'average_precision')
In [35]:
trainScoresMean = np.mean(trainScores, axis=1)
trainScoresStd = np.std(trainScores, axis=1)
crossValScoresMean = np.mean(crossValScores, axis=1)
crossValScoresStd = np.std(crossValScores, axis=1)
colours = plt.cm.tab10(np.linspace(0, 1, 9))
fig = plt.figure(figsize = (14, 9))
plt.fill_between(trainSizes, trainScoresMean - trainScoresStd,
trainScoresMean + trainScoresStd, alpha=0.1, color=colours[0])
plt.fill_between(trainSizes, crossValScoresMean - crossValScoresStd,
crossValScoresMean + crossValScoresStd, alpha=0.1, color=colours[1])
plt.plot(trainSizes, trainScores.mean(axis = 1), 'o-', label = 'train', \
color = colours[0])
plt.plot(trainSizes, crossValScores.mean(axis = 1), 'o-', label = 'cross-val', \
color = colours[1])
ax = plt.gca()
for axis in ['top','bottom','left','right']:
ax.spines[axis].set_linewidth(2)
handles, labels = ax.get_legend_handles_labels()
plt.legend(handles, ['train', 'cross-val'], bbox_to_anchor=(0.8, 0.15), \
loc=2, borderaxespad=0, fontsize = 16);
plt.xlabel('training set size', size = 16);
plt.ylabel('AUPRC', size = 16)
plt.title('Learning curves indicate underfit model', size = 20);
Okay not bad, not great either. I'm going to try smote and see if it gives me better results by oversampling
In [36]:
sm = SMOTE(random_state=12, ratio = 1.0)
x_train_res, y_train_res = sm.fit_sample(trainX, trainY)
In [45]:
#I don't know if other people have this problem or not, but when I use smote it always gives me np arrays instead of dataframes...
trainXnp = trainX.as_matrix()
trainYnp = trainY.as_matrix()
testXnp = testX.as_matrix()
testYnp = testY.as_matrix()
In [38]:
#clf_rf = RandomForestClassifier(n_estimators=25, random_state=12)
#clf_rf.fit(x_train_res, y_train_res)
weights = (Y == 0).sum() / (1.0 * (Y == 1).sum())
clf_xgb = XGBClassifier(max_depth = 7, scale_pos_weight = weights).fit(x_train_res, y_train_res)
In [43]:
probabilities = clf_xgb.predict_proba(testXnp)
print('AUPRC = {}'.format(average_precision_score(testYnp, \
probabilities[:, 1])))
In [46]:
trainSizes, trainScores, crossValScores = learning_curve(\
XGBClassifier(max_depth = 7, scale_pos_weight = weights, nthread = 4), trainXnp,\
trainYnp, scoring = 'average_precision')
In [47]:
trainScoresMean = np.mean(trainScores, axis=1)
trainScoresStd = np.std(trainScores, axis=1)
crossValScoresMean = np.mean(crossValScores, axis=1)
crossValScoresStd = np.std(crossValScores, axis=1)
colours = plt.cm.tab10(np.linspace(0, 1, 9))
fig = plt.figure(figsize = (14, 9))
plt.fill_between(trainSizes, trainScoresMean - trainScoresStd,
trainScoresMean + trainScoresStd, alpha=0.1, color=colours[0])
plt.fill_between(trainSizes, crossValScoresMean - crossValScoresStd,
crossValScoresMean + crossValScoresStd, alpha=0.1, color=colours[1])
plt.plot(trainSizes, trainScores.mean(axis = 1), 'o-', label = 'train', \
color = colours[0])
plt.plot(trainSizes, crossValScores.mean(axis = 1), 'o-', label = 'cross-val', \
color = colours[1])
ax = plt.gca()
for axis in ['top','bottom','left','right']:
ax.spines[axis].set_linewidth(2)
handles, labels = ax.get_legend_handles_labels()
plt.legend(handles, ['train', 'cross-val'], bbox_to_anchor=(0.8, 0.15), \
loc=2, borderaxespad=0, fontsize = 16);
plt.xlabel('training set size', size = 16);
plt.ylabel('AUPRC', size = 16)
plt.title('Learning curves indicate underfit model', size = 20);
So basically the oversampling + xgboost performed worse than just xgboost. The randomly generated data was probably not perfect and xgboost handled the heavy imbalancing much better than it had any right to. I'm using area under precision recall curve (auprc) instead of the ROC curve because the dataset is heavily skewed. I could get 99.9% accuracy by classifying everything as non-fraudulent but that doesn't tell us jack.
Obviously this isn't the best we can do (it's literally a mildly tuned xgb), but 86% auprc isn't bad for an hour's work and I just wanted to show that I do know the difference between ROC and PR.